import pandas as pd
import sqlite3
from plotly import express as px
from sklearn.linear_model import LinearRegression
import plotly.io as pio
="iframe" pio.renderers.default
Hello! In this assignment, we’ll be wrangling some climate data gathered by stations in the National Oceanic and Atmospheric Association (NOAA) to produce some scatter plots for each stations average yearly increase in temperature. To do this, first we’ll import the relevant packages. We’ll need: - sqlite3: for database management - pandas: for working with DataFrames - plotly express: for geographic visualizations - scikit-learn: for linear regression
Step 1. Create a Database
We’ll begin by importing all our necessary files and throwing them all into our temps.db
database. There is a lot of data, so using SQL will ultimately be more storage-conscious and we’ll be able to query from our tables more easily.
if __name__ == "__main__":
# Read in CSV files
= pd.read_csv("temps_stacked.csv")
temps
temps.dropna()= pd.read_csv("countries.csv")
countries # editing whitespace and - to _
= countries.rename(columns={'FIPS 10-4': 'FIPS_10_4', 'ISO 3166': 'ISO_3166'})
countries = pd.read_csv("https://raw.githubusercontent.com/PhilChodrow/PIC16B/master/datasets/noaa-ghcn/station-metadata.csv")
stations
# Open a connection to database
= sqlite3.connect("temps.db")
conn
# Adding the csv files
"temperatures", conn, if_exists = "replace", index = False)
temps.to_sql("countries", conn, if_exists = "replace", index = False)
countries.to_sql("stations", conn, if_exists = "replace", index = False)
stations.to_sql(
# Closing connection
conn.close()
Step 2. Create a query_climate_database
Function
This function will allow us to query our database (temps.db
) to get information for stations within a certain timeframe for a certain country. In particular, it requests: - the country of interest - the month of interest - the start year and end year (inclusive) - the database to query
and it returns: - a DataFrame containing the columns station name, station latitude, station longitude, country, year, month, and temperature (over the month).
def query_climate_database(db_file, country, year_begin, year_end, month):
'''
A function that queries a climate database, returning station information (name, lat, long)
and temperature data for a specific year frame and month, for a certain country.
Inputs: db_file (name of database), country (str), year_begin, year_end (int), month (int)
Returns: DataFrame
Example: query_climate_database("temps.db", 'India', 1980, 2020, 1)
'''
= sqlite3.connect(db_file)
conn
# Getting the country code
= \
cmd1 f"""
SELECT fips_10_4
FROM countries
WHERE name = '{country}'
"""
= pd.read_sql(cmd1, conn)
cell = cell.iat[0,0]
country_code
# SQL querying for relevant columns, subject to parameters
= \
cmd2 f"""
SELECT S.name, S.latitude, S.longitude, '{country}' Country, T.year, T.month, T.temp
FROM temperatures T
LEFT JOIN stations S ON T.id = S.id
WHERE (SUBSTRING(T.id, 1, 2) = '{country_code}')
AND (T.year BETWEEN {year_begin} AND {year_end})
AND (T.month = {month})
"""
= pd.read_sql(cmd2, conn)
df
# Closing connection and returning dataframe
conn.close()return df
We test our function here with "India"
, over the time period 1980-2020 for the month of January.
"temps.db", 'India', 1980, 2020, 1) query_climate_database(
NAME | LATITUDE | LONGITUDE | Country | Year | Month | Temp | |
---|---|---|---|---|---|---|---|
0 | PBO_ANANTAPUR | 14.583 | 77.633 | India | 1980 | 1 | 23.48 |
1 | PBO_ANANTAPUR | 14.583 | 77.633 | India | 1981 | 1 | 24.57 |
2 | PBO_ANANTAPUR | 14.583 | 77.633 | India | 1982 | 1 | 24.19 |
3 | PBO_ANANTAPUR | 14.583 | 77.633 | India | 1983 | 1 | 23.51 |
4 | PBO_ANANTAPUR | 14.583 | 77.633 | India | 1984 | 1 | 24.81 |
... | ... | ... | ... | ... | ... | ... | ... |
3147 | DARJEELING | 27.050 | 88.270 | India | 1983 | 1 | 5.10 |
3148 | DARJEELING | 27.050 | 88.270 | India | 1986 | 1 | 6.90 |
3149 | DARJEELING | 27.050 | 88.270 | India | 1994 | 1 | 8.10 |
3150 | DARJEELING | 27.050 | 88.270 | India | 1995 | 1 | 5.60 |
3151 | DARJEELING | 27.050 | 88.270 | India | 1997 | 1 | 5.70 |
3152 rows × 7 columns
Step 3. Geographic Scatter Function for Yearly Temperature Increases
In this section, we seek to answer our overarching question: > How does the average yearly change in temperature vary within a given country?
To do this, we define a function called temperature_coefficient_plot()
. It takes in the same parameters as query_climate_database
, and uses it to collect data for the selected country/timeframe. It then calculates for each station the average yearly change in temperature, by performing a linear regression using scikit-learn with years on the x-axis and temperature on the y-axis. The details of this process are defined in the helper function coef
. Finally, it takes its calculated slopes and plots each station (along with its latitude, longitude) onto a plotly interactive map, with the stations colorcoded by average yearly change in temperature. It then returns this figure.
def coef(data_group):
'''
Helper function for temperature_coefficient_plot
Inputs: DataFrame containing temperatures over a year period
Outputs: Average yearly change in temperature (as calculated by a linear regression)
'''
= data_group[["Year"]] # Dataframe format
X = data_group['Temp'] # Series format
y = LinearRegression()
LR # Calculates the coefficients
LR.fit(X,y) = LR.coef_[0] # Takes the slope (average yearly change in temp) from the model.
slope = round(slope, 4)
slope return slope
def temperature_coefficient_plot(db_file, country, year_begin, year_end, month, min_obs, **kwargs):
'''
A function that calculates the average yearly changes in temperature for stations in a certain country
over a certain time period over a certain month.
Inputs:
db_file (climate database), country (str), year_begin, year_end (int), month (int),
min_obs (int, the minimum number of years a station tracks data within the timeframe),
**kwargs (other keyword arguments for plotly map parameters)
Outputs:
A plotly figure containing points for stations, their lat/long, and average yearly change in temp.
'''
# Calling query_climate_database with parameters to get necessary data
= query_climate_database(db_file, country, year_begin, year_end, month)
df
# Filtering out stations with less than min_obs
'observations'] = df.groupby('NAME')['Year'].transform('count')
df[= df[df['observations'] > (min_obs-1)]
df
# Grouping by station, calculating avg. yearly change in temp
= df.groupby(['NAME', 'LATITUDE', 'LONGITUDE']).apply(coef)
coefs = coefs.reset_index()
coefs 'Estimated Yearly Increase (°C)'] = coefs.iloc[:, 3]
coefs[# coefs is now a DataFrame with columns for the station, lat, long, and avg. change in temp
# Plotting stations, coloring by avg. yearly change in temp
= px.scatter_mapbox(coefs,
fig = "LATITUDE",
lat = "LONGITUDE",
lon = "NAME",
hover_name = "Estimated Yearly Increase (°C)",
color = 0,
color_continuous_midpoint **kwargs) # now changing the style, this one is more low-contrast, keeps the coordinates out
=f"Estimates for Average Yearly Increase in Temperature (°C) for stations in {country}, years {year_begin}-{year_end}",
fig.update_layout(title=dict(size=10),
font={"r":0,"t":20,"l":0,"b":0})
margin
return fig
Here we test our function by plotting the temperature changes in India (1980-2020 in January) and China (1995-2022 in July).
= px.colors.diverging.RdGy_r
color_map = temperature_coefficient_plot("temps.db", 'India', 1980, 2020, 1,
fig = 10,
min_obs = 2,
zoom ="open-street-map",
mapbox_style=color_map)
color_continuous_scale fig.show()
= temperature_coefficient_plot("temps.db", 'China', 1995, 2022, 7,
fig = 10,
min_obs = 2,
zoom ="open-street-map",
mapbox_style=color_map)
color_continuous_scale fig.show()
It seems we can draw some conclusions that over the past few decades, controlling for month/season, temperature has been rising at a fairly steady rate over the majority of the country for both India and China, as evidenced by the predominance of red/light red dots.